import altair as alt
import eland as ed
import json
import numpy as np
import pandas as pd
from datetime import date
from elasticsearch import Elasticsearch
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
df = pd.read_csv('../data/tirol_obituaries_deduped_weekly.csv', delimiter = ',')
df.tail()
| district | municipaly | year | week | count | |
|---|---|---|---|---|---|
| 57611 | Innsbruck-Stadt | Innsbruck | 2020 | 48 | 7 |
| 57612 | Innsbruck-Stadt | Innsbruck | 2020 | 49 | 4 |
| 57613 | Innsbruck-Stadt | Innsbruck | 2020 | 50 | 9 |
| 57614 | Innsbruck-Stadt | Innsbruck | 2020 | 51 | 11 |
| 57615 | Innsbruck-Stadt | Innsbruck | 2020 | 52 | 1 |
#http://www.statistik.at/wcm/idc/idcplg?IdcService=GET_NATIVE_FILE&RevisionSelectionMethod=LatestReleased&dDocName=122943
statistik_austria = pd.DataFrame({
'week': [52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1],
'death_count': [144,175,169,176,205,197,168,138,133,124,125,116,117,120,114,126,131,125,114,109,137,109,93,108,98,111,102,100,91,101,101,111,98,98,97,114,122,156,155,157,137,131,128,137,123,117,120,132,133,138,137,123]
})
statistik_austria.head()
| week | death_count | |
|---|---|---|
| 0 | 52 | 144 |
| 1 | 51 | 175 |
| 2 | 50 | 169 |
| 3 | 49 | 176 |
| 4 | 48 | 205 |
Running ./bin/parse.sh will create the source file for outlier detection: ./data/tirol_obituaries_deduped_weekly_outlier_detection_features.csv.
The columns of this file are: district,municipaly,year,week,count,yearly_max,weekly_max
yearly_max and weekly_max are the derived feature vectors. They are the difference between each rows count and the municipaly's maximum value of the row's week and the yearly maximum of that municipaly.
To calculate the outlier_score for each row, use the following configuration with Elasticsearch's Outlier Detection feature:
{
"id": "tirol_outlier_high_count_2_1",
"description": "",
"source": {
"index": [
"tirol_outlier_high_count_2"
],
"query": {
"match_all": {}
}
},
"dest": {
"index": "tirol_outlier_high_count_dest_2_1",
"results_field": "ml"
},
"analysis": {
"outlier_detection": {
"compute_feature_influence": true,
"outlier_fraction": 0.05,
"standardization_enabled": true
}
},
"analyzed_fields": {
"includes": [],
"excludes": [
"count",
"week",
"year"
]
},
"model_memory_limit": "2mb",
"create_time": 1587934690989,
"version": "8.0.0",
"allow_lazy_start": false
}
To learn more about the feature head over to the docs here.
# with open('elasticsearch-host', 'r') as file:
# es_host = file.read().replace('\n', '')
# ed_outliers = ed.read_es(es_host, 'tirol_outlier_high_count_dest_3_1')
# ed_outliers = ed_outliers[['district', 'municipaly', 'year', 'week', 'ml.outlier_score']]
# df_outliers = ed.eland_to_pandas(ed_outliers)
# df_outliers = df_outliers.rename(columns={'ml.outlier_score': 'outlier_score'})
# df_outliers.to_csv('../data/tirol_obituaries_outliers_weekly.csv', index=False)
# df_outliers.head()
# alt.Chart(df_outliers, width=120, height=120).mark_bar().encode(
# alt.X("outlier_score:Q", bin=alt.Bin(maxbins=10)),
# y='count()',
# tooltip=['count()', 'outlier_score']
# )
# # A dropdown filter
# split = 'district'
# outlier_distribution = alt.Chart(df_outliers, width=300)
# points = outlier_distribution.mark_circle(opacity=0.9,size=20).encode(
# x='outlier_score:Q',
# y=split+':N',
# tooltip=['district', 'municipaly', 'week', 'outlier_score']
# )
# error_bars = outlier_distribution.mark_rule(opacity=0.2,strokeWidth=10).encode(
# x='ci0(outlier_score)',
# x2='ci1(outlier_score)',
# y=split+':N',
# )
# median_bars = outlier_distribution.mark_point(
# opacity=0.8,strokeWidth=1,size=200,shape='stroke',color='black',angle=90
# ).encode(
# x='median(outlier_score)',
# y=split+':N',
# )
# box_plot = outlier_distribution.mark_boxplot(outliers=False,median=False,opacity=0.5).encode(
# x='outlier_score:Q',
# y='district:N'
# )
# (box_plot + median_bars + points)
# alt.Chart(df_outliers, width=600).mark_boxplot().encode(
# x='outlier_score:Q'
# )
# d_both = pd.merge(df, df_outliers, how='left', on=['district', 'municipaly', 'year', 'week'])
# d_both.head()
base = alt.Chart(df, width=470, height=200)
x_domain=(1,52)
# we don't need that anymore since we're in 2021 now
#week_cut_off=str(date.today().isocalendar()[1]-2)
week_cut_off='53'
week_point = base.transform_filter(
'datum.year==2020 & datum.week<'+week_cut_off
).mark_circle(color='red', opacity=0, size=150).encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('sum(count)', title=''),
tooltip=[
alt.Tooltip('week:O', title="Kalenderwoche"),
alt.Tooltip('sum(count):Q', title="Todesanzeigen")
]
)
week_historic_area = base.transform_filter(
'datum.year!=2021 && datum.year!=2020 && datum.year!=2016'
).transform_joinaggregate(
week_count='sum(count)',
groupby=['year', 'week']
).mark_area(color='lightgray', opacity=0.5, strokeJoin='round').encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('max(week_count):Q', title=''),
alt.Y2('min(week_count):Q', title='')
)
week_historic_mean = base.transform_filter(
'datum.year!=2021 && datum.year!=2020 && datum.year!=2016'
).transform_joinaggregate(
week_count='sum(count)',
groupby=['year', 'week']
).mark_line(color='gray', opacity=0.25, strokeJoin='round').encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('mean(week_count):Q', title='')
)
week_current_line = base.transform_filter(
'datum.year==2020 & datum.week<'+week_cut_off
).mark_line(strokeJoin='round').encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('sum(count):Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False))
)
week_statistik_austria_line = alt.Chart(statistik_austria).mark_line(clip=True,color='orange', strokeJoin='round').encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('death_count:Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False)),
tooltip=[
alt.Tooltip('week:O', title="Kalenderwoche"),
alt.Tooltip('death_count:Q', title="Todesfälle")
]
)
week_statistik_austria_line_80 = alt.Chart(statistik_austria).transform_calculate(
death_count_adjusted='datum.death_count*0.8'
).mark_line(clip=True,color='#ffbb78', strokeJoin='round').encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('death_count_adjusted:Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False)),
tooltip=[
alt.Tooltip('week:O', title="Kalenderwoche"),
alt.Tooltip('death_count_adjusted:Q', title="Todesfälle 80%")
]
)
chart_3_legendDomain = [
'Statistik Austria 2020',
# 'Statistik Austria 2020 80%',
'Todesanzeigen 2020',
'Todesanzeigen 2017-2019 min/max',
'Todesanzeigen 2017-2019 mean'
]
chart_3_legendColors = [
'orange',
# '#ffbb78',
'#1f77b4',
'lightgray',
'gray'
]
chart_3_legendData = pd.DataFrame({
'label': chart_3_legendDomain
})
legend3 = alt.Chart(chart_3_legendData).mark_square(size=150).encode(
y=alt.Y(
'label:N',
axis=alt.Axis(domain=False, ticks=False, orient='right'),
title=None
),
color=alt.Color(
'label',
scale=alt.Scale(domain=chart_3_legendDomain,range=chart_3_legendColors),
legend=None
)
)
vega_week = ((
week_historic_area +
week_historic_mean +
week_statistik_austria_line +
# week_statistik_austria_line_80 +
week_current_line +
week_point
) | legend3).properties(
title={
"text": ["Todesanzeigen Tirol pro Woche im Jahresverlauf"],
}
).configure_axis(
grid=True,
gridColor="#eee",
domainColor="#ddd",
tickColor="#ddd",
labelColor="gray",
labelBound=True,
).configure_view(
strokeWidth=0
).configure_title(
fontSize=14,
fontWeight='bold',
anchor='start',
color='gray'
)
vega_week
def district_chart(district='Kufstein'):
d_df = df[df['district']==district]
d_base = alt.Chart(d_df).properties(
width=220,
height=125
)
x_domain=(1,52)
y_domain=(0,40)
d_week_historic_area = d_base.transform_filter(
'datum.year!=2021 && datum.year!=2020 && datum.year!=2016'
).mark_area(color='lightgray', opacity=0.5, strokeJoin='round').transform_joinaggregate(
week_count='sum(count)',
groupby=['year', 'week']
).encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('max(week_count):Q', title=''),
alt.Y2('min(week_count):Q', title='')
)
d_week_historic_mean = d_base.transform_filter(
'datum.year!=2021 && datum.year!=2020 && datum.year!=2016'
).mark_line(color='gray', opacity=0.25, strokeJoin='round').transform_joinaggregate(
week_count='sum(count)',
groupby=['year', 'week']
).encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('mean(week_count):Q', title='')
)
d_week_current_line = d_base.transform_filter(
'datum.year==2020 & datum.week<'+week_cut_off
).mark_line(strokeJoin='round').encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('sum(count):Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False))
)
d_week_point = d_base.transform_filter(
'datum.year==2020 & datum.week<'+week_cut_off
).mark_circle(color='gray', opacity=0, size=150).encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('sum(count):Q', scale=alt.Scale(domain=y_domain,zero=True,nice=False), title=''),
tooltip=[
alt.Tooltip('week:O', title="Kalenderwoche"),
alt.Tooltip('sum(count):Q', title="Todesanzeigen")
]
)
return (
d_week_historic_area +
d_week_historic_mean +
d_week_current_line +
d_week_point
).properties(
title={
"text": [district],
}
)
d_chart_3_legendDomain = [
'Todesanzeigen 2020',
'Todesanzeigen 2017-2019 min/max',
'Todesanzeigen 2017-2019 mean'
]
d_chart_3_legendColors = ['#1f77b4', 'lightgray', 'gray']
d_chart_3_legendData = pd.DataFrame({
'label': d_chart_3_legendDomain
})
d_legend3 = alt.Chart(d_chart_3_legendData).mark_square(size=150).encode(
y=alt.Y(
'label:N',
axis=alt.Axis(domain=False, ticks=False, orient='right'),
title=None
),
color=alt.Color(
'label',
scale=alt.Scale(domain=d_chart_3_legendDomain,range=d_chart_3_legendColors),
legend=None
)
)
vega_week_district = (
(
district_chart('Imst') | district_chart('Innsbruck-Stadt') | district_chart('Innsbruck-Land')
) & (
district_chart('Kitzbühel') | district_chart('Kufstein') | district_chart('Landeck')
) & (
district_chart('Lienz') | district_chart('Reutte') | district_chart('Schwaz')
) | d_legend3
).configure_axis(
grid=True,
gridColor="#eee",
domainColor="#ddd",
tickColor="#ddd",
labelColor="gray",
labelBound=True,
).configure_view(
strokeWidth=0
).configure_title(
fontSize=14,
fontWeight='normal',
anchor='start',
color='gray'
)
vega_week_district
d_both = df
def municipaly_chart(district='Kufstein'):
d_both_district = d_both[d_both['district']==district]
d_base = alt.Chart(d_both_district).properties(
width=150,
height=60
)
x_domain=(1,52)
y_domain=(0,20)
outlier_threshold='0.5'
d_week_historic_area = d_base.transform_filter(
'datum.year!=2021 && datum.year!=2020 && datum.year!=2016'
).transform_joinaggregate(
week_count='sum(count)',
groupby=['week','year']
).transform_calculate(
week_count_min='min(datum.week_count)',
week_count_max='max(datum.week_count)'
).mark_area(fill='lightgray', opacity=.35, strokeJoin='round', strokeWidth=1, stroke='lightgray', strokeOpacity=0.35).encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('max(week_count):Q', title=''),
alt.Y2('min(week_count):Q', title='')
)
d_week_historic_mean = d_base.transform_filter(
'datum.year!=2021 && datum.year!=2020 && datum.year!=2016'
).transform_joinaggregate(
week_count='sum(count)',
groupby=['week','year']
).mark_line(color='gray', opacity=0.25, strokeJoin='round').encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('mean(week_count):Q', title='')
)
d_week_current_line = d_base.transform_filter(
'datum.year==2020 & datum.week<'+week_cut_off
).transform_joinaggregate(
week_count='sum(count)',
groupby=['week','year']
).mark_line(strokeJoin='round').encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('week_count:Q', title='', axis=alt.Axis(domainOpacity=0,ticks=False))
)
d_week_point = d_base.transform_filter(
'datum.year==2020 & datum.week<'+week_cut_off
).mark_circle(color='gray', opacity=0, size=150).encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('sum(count):Q', scale=alt.Scale(zero=True), title=''),
tooltip=[
alt.Tooltip('week:O', title="Kalenderwoche"),
alt.Tooltip('sum(count):Q', title="Todesanzeigen"),
alt.Tooltip('outlier_score:Q', title="Outlier-Score", format='.2%')
]
)
d_week_outlier = d_base.transform_filter(
'datum.year==2020 & datum.week<'+week_cut_off+' & datum.outlier_score>='+outlier_threshold
).transform_joinaggregate(
week_count='sum(count)',
outlier_score_max='max(outlier_score)',
groupby=['week','year']
).mark_point(color='red', opacity=.4, size=60, strokeWidth=3).encode(
alt.X('week:Q', scale=alt.Scale(domain=x_domain,zero=False,nice=False), title='', axis=alt.Axis(grid=False)),
alt.Y('week_count:Q', scale=alt.Scale(zero=True), title='')
)
return (
d_week_historic_area +
d_week_historic_mean +
d_week_current_line +
d_week_outlier +
d_week_point
).facet(
facet=alt.Facet('municipaly:N', title=None, header=alt.Header(
labelAnchor='start',
labelOrient='bottom',
labelPadding=0
)),
columns=5,
padding=0,
title={
"text": ["Todesanzeigen im Bezirk "+district+" pro Gemeinde pro Woche im Jahresverlauf"],
"subtitle": [
# "Durchschnitt pro Monat in Hellblau",
"Quelle: https://github.com/walterra/covid-19-tirol-ds"
],
"color": "black",
"subtitleColor": "gray"
},
).configure_axis(
grid=True,
gridColor="#eee",
domainColor="#ddd",
tickColor="#ddd",
labelColor="gray",
labelBound=True,
).configure_view(
strokeWidth=0
).configure_title(
fontSize=14,
fontWeight='bold',
anchor='start',
color='gray'
)
d_chart_3_legendDomain = [
'Todesanzeigen 2020',
'Todesanzeigen 2017-2019 min/max',
'Todesanzeigen 2017-2019 mean'
]
d_chart_3_legendColors = ['#1f77b4', 'lightgray', 'gray']
d_chart_3_legendData = pd.DataFrame({
'label': d_chart_3_legendDomain
})
d_legend3 = alt.Chart(d_chart_3_legendData).mark_square(size=150).encode(
y=alt.Y(
'label:N',
axis=alt.Axis(domain=False, ticks=False, orient='right'),
title=None
),
color=alt.Color(
'label',
scale=alt.Scale(domain=d_chart_3_legendDomain,range=d_chart_3_legendColors),
legend=None
)
)
vega_week_municipaly = municipaly_chart('Landeck')
vega_week_municipaly
# save chart_1 as formatted JSON
with open("../docs/data/vega_week.json", "w") as jsonFile:
json.dump(json.loads(vega_week.to_json()), jsonFile, indent=4, sort_keys=True)
# save chart_1 as image
vega_week.save("../assets/vega_week.png")
# save chart_1 as formatted JSON
with open("../docs/data/vega_week_district.json", "w") as jsonFile:
json.dump(json.loads(vega_week_district.to_json()), jsonFile, indent=4, sort_keys=True)
# save chart_2 as image
vega_week_district.save("../assets/vega_week_district.png")
# save chart_3 as image
vega_week_municipaly.save("../assets/vega_week_municipaly.png")
districts = {
"imst": "Imst",
"innsbruck-land": "Innsbruck-Land",
"innsbruck-stadt": "Innsbruck-Stadt",
"kitzbuehel": "Kitzbühel",
"kufstein": "Kufstein",
"landeck": "Landeck",
"reutte": "Reutte",
"schwaz": "Schwaz",
"lienz": "Lienz"
}
for d in districts:
chart = municipaly_chart(districts[d])
with open("../docs/data/vega_" + d + ".json", "w") as jsonFile:
json.dump(json.loads(chart.to_json()), jsonFile, indent=4, sort_keys=True)